The data source for this lab is a set of 31 CSV files containing 15-minute vehicle counts from January 2013, which is the same as lab03. The rows within these CSV files are similar to the rows in the CSV file from Lab Zero and Lab 01. They are stored in the VMs at path: /course/NZTA/.
# Baseline timing: read.csv() with no column-type hints, so it must scan
# the file and infer each column's class itself.  system.time() reports
# user/system/elapsed seconds for the whole expression.
system.time({
countsDF <- read.csv('/course/NZTA/20130101_20130331_TMSTrafficQuarterHour.csv')
})
## user system elapsed
## 21.633 0.968 22.612
Yes, read.csv() runs faster if it does not have to infer the column data types itself.
# Re-time read.csv() with explicit column classes so it can skip type
# inference.  colClasses supplies one class per CSV column, in order.
system.time({
# adding colClasses as a parameter to reduce time taken by read.csv()
countsDF <- read.csv('/course/NZTA/20130101_20130331_TMSTrafficQuarterHour.csv', header=TRUE, colClasses = c("character", "character", "character", "character", "integer", "numeric"))
})
## user system elapsed
## 14.354 0.487 14.848
read.csv() performed much faster this time: elapsed time went from ~22s to ~15s (results vary per run).
# Time data.table::fread(), which memory-maps the file, parses it with
# multiple threads, and samples rows to guess column types.  The low
# "elapsed" relative to "user" time reflects the parallel parsing.
# Fix: use `<-` for assignment rather than `=` (R style convention;
# behaviour is unchanged, so the timing comparison remains valid).
system.time({
countsDT <- data.table::fread('/course/NZTA/20130101_20130331_TMSTrafficQuarterHour.csv')
})
## user system elapsed
## 26.527 0.336 4.133
data.table::fread() read the data much faster than read.csv(), even when colClasses was supplied to read.csv().
library(profvis)
# provided code
# Profile the data.frame pipeline: parse startDatetime (e.g.
# "01-JAN-2013 00:00") into a Date column, then use aggregate() to sum
# count within each (day, siteRef, class) group.  This is the slow
# reference implementation that later chunks try to beat.
p1 <- profvis({
countsDF$day <- as.Date(countsDF$startDatetime, format="%d-%b-%Y")
dailyCountsDF <- aggregate(countsDF["count"],
countsDF[c("day", "siteRef", "class")],
sum)
})
p1
The aggregate.data.frame and as.Date.character functions took the longest time.
Total time ~ 50000ms
# Profile the equivalent data.table pipeline: `:=` adds the day column
# by reference (no copy of countsDT), and the chained
# [, sum(count), .(day, siteRef, class)] performs the grouped sum.
p2 <- profvis({
dailyCountsDT <- countsDT[, day := as.Date(startDatetime, format="%d-%b-%Y")][, sum(count), .(day, siteRef, class)]
})
p2
The as.Date.character function took the most time.
Total time ~ 8000ms
# Avoid the expensive as.Date() call entirely: split each
# "DD-MON-YYYY HH:MM" string on the space and keep the first field as a
# plain character day key, then aggregate as before.
p3 <- profvis({
# changing the day column to include character data types
countsDF$day <- data.frame(do.call('rbind', strsplit(as.character(countsDF$startDatetime),' ',fixed=TRUE)))$X1
dailyCountsDF <- aggregate(countsDF["count"],
countsDF[c("day", "siteRef", "class")],
sum)
})
p3
By changing the date to a character data type, the run time was reduced to about ~ 25000 ms
# Same character-split trick for the data.table pipeline.
# Fix: the original referenced countsDF$startDatetime inside the
# countsDT[...] expression — mixing the two objects only works by
# accident when they happen to have identical row order, and it also
# times the data.frame column rather than the data.table one.  Use
# countsDT's own startDatetime column instead.
p4 <- profvis({
dailyCountsDT <- countsDT[, day := data.frame(do.call('rbind', strsplit(as.character(startDatetime),' ',fixed=TRUE)))$X1][, sum(count), .(day, siteRef, class)]
})
p4
By changing the date to a character data type, we reduce the run time to ~7000ms.
# Sanity check: both pipelines should produce one row per
# (day, siteRef, class) group, so their dimensions must match.
dim(dailyCountsDF)
## [1] 72157 4
dim(dailyCountsDT)
## [1] 72157 4
They both give the result: 72157 rows, and 4 columns
head(dailyCountsDF[order(-dailyCountsDF$count),])
## day siteRef class count
## 45594 28-MAR-2013 01N10431 L 99129
## 45552 14-FEB-2013 01N10431 L 99035
## 45575 22-FEB-2013 01N10431 L 98618
## 50002 28-MAR-2013 01N29424 L 98540
## 45572 21-FEB-2013 01N10431 L 97987
## 45514 01-FEB-2013 01N10431 L 97696
head(dailyCountsDT[order(-dailyCountsDT$V1),])
## day siteRef class V1
## 1: 28-MAR-2013 01N10431 L 99129
## 2: 14-FEB-2013 01N10431 L 99035
## 3: 22-FEB-2013 01N10431 L 98618
## 4: 28-MAR-2013 01N29424 L 98540
## 5: 21-FEB-2013 01N10431 L 97987
## 6: 01-FEB-2013 01N10431 L 97696
Both heads are the same when sorted by count (descending). They also have the same number of rows and columns, so we can assume they give identical results.
# Square-root transform the daily totals; scount is the response
# modelled in the cross-validation sections below.
dailyCountsDF$scount <- with(dailyCountsDF, sqrt(count))
head(dailyCountsDF)
## day siteRef class count scount
## 1 01-JAN-2013 00200002 H 737.5 27.15695
## 2 01-MAR-2013 00200002 H 2370.0 48.68265
## 3 02-JAN-2013 00200002 H 1214.5 34.84968
## 4 02-MAR-2013 00200002 H 1209.5 34.77787
## 5 03-JAN-2013 00200002 H 1757.0 41.91658
## 6 03-MAR-2013 00200002 H 1094.0 33.07567
# For each chunk, use the chunk as a test set and the other nine chunks as the training set and fit a model using the training data to predict scount from the overall mean of scount
# 10-fold cross-validation of the intercept-only model: every test row
# is predicted by the training-set mean of scount.
p5 <- profvis({
set.seed(123456789)
f1 <- function(x){
# Randomly assign each row to one of 10 folds of (nearly) equal size.
index <- sample(rep(1:10, length.out=nrow(x)))
RMSE <- numeric(10)
for (k in 1:10){
test <- x[index == k,]
train <- x[index != k,]
obs <- test$scount
predMean <- mean(train$scount)
RMSE[k]<-sqrt(mean((obs-predMean)^2))}
# Average RMSE over the 10 folds.
sum(RMSE)/10
}
f1(dailyCountsDF)
})
# For each chunk, use the chunk as a test set and the other nine chunks as the training set and fit the model using the training data to predict scount from class
# 10-fold cross-validation of a one-predictor linear model:
# scount ~ class, refit on the training folds each iteration.
p6 <- profvis({
set.seed(123456789)
f2 <- function(x){
# Same fold assignment scheme as f1 (seed is set just above, so the
# folds match between the two profiled runs).
index <- sample(rep(1:10, length.out=nrow(x)))
RMSE <- numeric(10)
for (k in 1:10){
test <- x[index == k,]
train <- x[index != k,]
obs <- test$scount
lmfit<- lm(scount ~ class, train)
predLM <- predict(lmfit, test)
RMSE[k]<-sqrt(mean((obs-predLM)^2))}
# Average RMSE over the 10 folds.
sum(RMSE)/10
}
f2(dailyCountsDF)
})
# profiling for the first model (scount from mean of scount)
p5
The lines which took longest to run are:
train <- x[index != k,] took the longest time at ~70s
index <- sample(rep(1:10, length.out=nrow(x))) and test <- x[index == k,] each took ~10s to run
# profiling from second model (scount from class)
p6
The lines which took longest to run (in order) are:
train <- x[index != k,] (~70s)
lmfit<- lm(scount ~ class, train) (~50s)
test <- x[index == k,] (~20s)
# modelling on ALL data (dailyCountsDF)
# Single hold-out split: fold 1 is the test set, folds 2-10 the
# training set.  predMean comes from the training data; the lm() below
# is deliberately fit on the full data set, matching the heading above.
index <- sample(rep(1:10, length.out = nrow(dailyCountsDF)))
test <- dailyCountsDF[index == 1, ]
train <- dailyCountsDF[index != 1, ]
obs <- test$scount
predMean <- mean(train$scount)
lmfit <- lm(scount ~ class, dailyCountsDF)
predLM <- predict(lmfit, dailyCountsDF)
# plotting both models on all of the data (dailyCountsDF)
# Jitter the numeric class codes so overlapping points are visible,
# then relabel the x axis with the class names.  The green horizontal
# line is the mean-only model; the red points are the per-class lm()
# predictions.
plot(scount ~ jitter(as.numeric(factor(class))), dailyCountsDF,
xlab="class", axes=FALSE)
axis(2)
axis(1, at=as.numeric(unique(factor(dailyCountsDF$class))),
label=unique(factor(dailyCountsDF$class)))
abline(h=predMean, col="green")
points(as.numeric(unique(factor(dailyCountsDF$class))),
predict(lmfit, data.frame(class=unique(factor(dailyCountsDF$class)))),
pch=16, col="red")
In this lab, we used CSV file data from the directory in our virtual machines at /course/NZTA. We imported the first CSV and measured how long it took to read using system.time(). We then improved the speed of read.csv() by adding the colClasses parameter and compared it to data.table::fread().
We then used profiling to see the breakdown of time spent within tasks for read.csv() and data.table::fread(). We made these methods faster by changing the data types (date to character) and compared results.
For modelling, we used a function which loops for (k in 1:10) to split the data into 10 chunks for 10-fold training and testing. A profiling breakdown showed which lines took the longest to run.